# Load the raw export; empty strings and the literal "NA" are both read as
# missing values.
initial_data <- read.csv("Data_Econ_index.csv", na.strings = c("", "NA"))

# Drop the first two rows of the export.
initial_data <- initial_data[-(1:2), ]

# Expand "Yerevan" to "Yerevan, Armenia"; the second pass collapses the
# doubled suffix created when a value already ended in ", Armenia".
initial_data$primary.location <- initial_data$primary.location %>%
  str_replace_all("Yerevan", "Yerevan, Armenia") %>%
  str_replace_all("Yerevan, Armenia, Armenia", "Yerevan, Armenia")

# Spell out "N/A" placeholders in the funding column as "Unknown".
initial_data$funding <- str_replace_all(initial_data$funding, "N/A", "Unknown")

# Founder counts: replace " N/A" (with leading space) before plain "N/A",
# then strip the "(?)" marker from "4(?)".
initial_data$X..of.founders <- initial_data$X..of.founders %>%
  str_replace_all(" N/A", "Unknown") %>%
  str_replace_all("N/A", "Unknown") %>%
  str_replace_all("4\\(\\?\\)", "4")

# Convert the remaining NAs into an explicit "Unknown" factor level and drop
# the government-support date range column.
data_unknowns <- initial_data %>%
  mutate(
    primary.location = fct_explicit_na(primary.location, na_level = "Unknown"),
    accelerator.incubator = fct_explicit_na(accelerator.incubator, na_level = "Unknown"),
    current.stage = fct_explicit_na(current.stage, na_level = "Unknown"),
    funding = fct_explicit_na(funding, na_level = "Unknown"),
    date.published = fct_explicit_na(date.published, na_level = "Unknown"),
    X..of.founders = fct_explicit_na(X..of.founders, na_level = "Unknown")
  ) %>%
  select(-date.range.for.government.support)
# Extract the first run of four consecutive digits from each element of
# `string` and return it as a number.
#
# Args:
#   string: character vector to search.
#
# Returns:
#   A numeric (double) vector with one entry per element of `string`;
#   NA where an element contains no four-digit run. Empty input yields
#   numeric(0).
yearExtract <- function(string) {
  # Guard the empty case explicitly so the result is always a numeric vector.
  if (length(string) == 0L) {
    return(numeric(0))
  }
  matches <- regmatches(string, regexec("[0-9]{4}", string))
  # vapply() pins the result type to double; sapply() would return a logical
  # vector when nothing matches and a list for empty input.
  vapply(
    matches,
    function(m) {
      if (length(m) > 0L) as.numeric(m[[1L]]) else NA_real_
    },
    numeric(1)
  )
}
# Changing date published data to be only in years format
data_unknowns$date.published <- yearExtract(as.character(data_unknowns$date.published))
yrs <- data_unknowns$date.published
# Anchor every year on January 1st before converting to Date. Parsing with
# format = "%Y" alone fills the missing month/day from today's date, which
# turns non-leap years into NA when the script runs on February 29th.
# sprintf() keeps zero-length input zero-length; missing years become the
# unparseable string "NA-01-01" and therefore stay NA.
yr <- as.Date(sprintf("%s-01-01", as.character(yrs)), format = "%Y-%m-%d")
data_unknowns$date.published <- year(yr)
# Replace missing years with the label 'Unknown'. Because a string is assigned
# into the column, it no longer holds purely numeric values afterwards.
data_unknowns$date.published[is.na(data_unknowns$date.published)] <- 'Unknown'
# Attach a per-row tally of how often each row's date.published value occurs:
# table() counts the column and is then indexed by the column itself.
# NOTE(review): the value passed as `count` is the indexed table, not a plain
# vector, so the final column name(s) are generated by transform(); the code
# further down relies on names such as count.Var1 / count.Freq.1 — verify.
data_unknowns2 <- transform(data_unknowns, count = table(date.published)[date.published])
# Drop the rows whose year is 'Unknown', then group by the combination of
# startup stage and publication year.
walk <- data_unknowns2 %>%
filter(date.published != "Unknown") %>%
group_by(interaction(current.stage, date.published))
# Same tally technique as above, this time per stage/year combination:
# the table of the interaction is indexed by the interaction itself and
# attached as `count`.
walk2 <- transform(walk, count = table(interaction(current.stage, date.published))[interaction(current.stage, date.published)])
# Keep one row per distinct count.Var1 value while retaining all other columns.
# NOTE(review): count.Var1 is presumably the stage/year combination produced by
# the transform() call above — confirm against names(walk2).
walk2 <- distinct(walk2, count.Var1, .keep_all = TRUE)
# Streamgraph of startup phases by publication year (stream labels are still
# not showing up as intended).
# NOTE(review): the annotation text "Vermont" looks unrelated to this chart —
# confirm whether it is a leftover from an example.
streamgraph(walk2, "current.stage", "count.Freq.1", "date.published") %>%
  sg_fill_manual(
    c("#ffa500", "blue", "purple", "red", "#00ff00", "red")
  ) %>%
  sg_legend(show = TRUE, label = "Phase of Startup") %>%
  sg_axis_x(1, "year", "%Y") %>%
  sg_title(title = "Phases of Startups in Armenia 2016-2020") %>%
  sg_annotate(
    label = "Vermont",
    x = as.Date("2016-04-01"),
    y = 0.6,
    color = "purple"
  )
# Phases of Startups in Armenia 2016-2020
#data_unknowns2
# Horizontal sankey diagram whose nodes are labelled with accelerator /
# incubator names.
# NOTE(review): five node labels are supplied, but six colours are given and
# the links reference node index 5; two links also point from a node to
# itself (4 -> 4, 5 -> 5). Confirm the intended nodes and flows.
fig <- plot_ly(
  type = "sankey",
  orientation = "h",
  node = list(
    label = c(
      "Armenia Startup Academy",
      "AUA EPIC",
      "Hero House AI Incubator",
      "STEP EIF",
      "Unknown"
    ),
    color = rep("blue", 6),
    pad = 15,
    thickness = 20,
    line = list(color = "black", width = 0.5)
  ),
  link = list(
    source = c(0, 1, 2, 3, 4, 5),
    target = c(1, 2, 3, 4, 4, 5),
    value = c(8, 4, 2, 8, 4, 2)
  )
)

# Title and base font size, then display the figure.
fig <- layout(fig, title = "Basic Sankey Diagram", font = list(size = 10))
fig
# Horizontal sankey diagram with six generic nodes (A1..C2); this reassigns
# `fig`, replacing whatever it held before.
fig <- plot_ly(
  type = "sankey",
  orientation = "h",
  node = list(
    label = c("A1", "A2", "B1", "B2", "C1", "C2"),
    color = rep("blue", 6),
    pad = 15,
    thickness = 20,
    line = list(color = "black", width = 0.5)
  ),
  link = list(
    source = c(0, 1, 0, 2, 3, 3),
    target = c(2, 3, 3, 4, 4, 5),
    value = c(8, 4, 2, 8, 4, 2)
  )
)

# Title and base font size, then display the figure.
fig <- layout(fig, title = "Basic Sankey Diagram", font = list(size = 10))
fig